{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "hmnrwuHZHxss",
    "outputId": "3896a25b-0eeb-4cd5-9d26-6eba6954fe53"
   },
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "from datasets import load_dataset\n",
    "\n",
    "ds = load_dataset(\"DenCT/codeforces-problems-7k\", split=\"train\")\n",
    "print(ds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "CODEFORCES_TEMPLATE = \"\"\"\n",
    "{description}\n",
    "\n",
    "Input Format:\n",
    "{input_format}\n",
    "\n",
    "Output Format:\n",
    "{output_format}\n",
    "\n",
    "Tags:\n",
    "{tags}\n",
    "\n",
    "Time Limit: {time_limit} ms\n",
    "Memory Limit: {memory_limit} MB\n",
    "\"\"\"\n",
    "\n",
    "CODEFORCES_TEMPLATE_NO_LIMIT = \"\"\"\n",
    "{description}\n",
    "\n",
    "Input Format:\n",
    "{input_format}\n",
    "\n",
    "Output Format:\n",
    "{output_format}\n",
    "\n",
    "Tags:\n",
    "{tags}\n",
    "\n",
    "Demo input: {demo_input}\n",
    "Demo output: {demo_output}\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "id": "311OF4w_K4EJ"
   },
   "outputs": [],
   "source": [
    "# func. to extract difficulty rating from tags\n",
    "def extract_difficulty(tags):\n",
    "    if pd.isnull(tags):  # Handles when tags are null\n",
    "        return 0\n",
    "    for tag in tags.split(\",\"):\n",
    "        if \"*\" in tag:  # Difficulty is marked with a '*' symbol\n",
    "            try:\n",
    "                return int(tag.replace(\"*\", \"\"))\n",
    "            except ValueError:\n",
    "                continue\n",
    "    return 0  # Default difficulty"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "U2gCVF4ULG9l"
   },
   "outputs": [],
   "source": [
    "dataset = []\n",
    "for entry in ds:\n",
    "    tests = entry[\"test_cases\"]\n",
    "    if len(tests) <= 1:\n",
    "        continue\n",
    "    new_entry = {\n",
    "        \"problem\": CODEFORCES_TEMPLATE_NO_LIMIT.format(\n",
    "            description=entry[\"problem-description\"],\n",
    "            input_format=entry[\"input-specification\"],\n",
    "            output_format=entry[\"output-specification\"],\n",
    "            tags=entry[\"tags\"],\n",
    "            demo_input=entry[\"demo-input\"],\n",
    "            demo_output=entry[\"demo-output\"],\n",
    "        ),\n",
    "        \"tests\": tests,\n",
    "    }\n",
    "    dataset.append(new_entry)\n",
    "\n",
    "print(f\"Train dataset size: {len(dataset)}\")\n",
    "\n",
    "output_dir = os.path.abspath(\"../../train/code\")\n",
    "output_file = os.path.join(output_dir, \"codeforces.json\")\n",
    "\n",
    "with open(output_file, \"w\") as f:\n",
    "    json.dump(dataset, f, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds = load_dataset(\"Qwen/CodeElo\", split=\"test\")\n",
    "print(ds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_test_cases(raw_cases):\n",
    "    # Examples field: https://huggingface.co/datasets/Qwen/CodeElo\n",
    "    # The first element is the full input string, the second is the full output string\n",
    "    formatted_cases = []\n",
    "    for rc in raw_cases:\n",
    "        input_case = rc[0]  # Keep input as a single string\n",
    "        output_case = rc[1]  # Keep output as a single string\n",
    "        # Structure the test cases\n",
    "        formatted_cases.append({\"input\": input_case, \"output\": output_case})\n",
    "    return formatted_cases\n",
    "\n",
    "\n",
    "dataset = []\n",
    "contest_ids = []\n",
    "for entry in ds:\n",
    "    tests = process_test_cases(entry[\"examples\"])\n",
    "    if len(tests) == 0:\n",
    "        continue\n",
    "    new_entry = {\n",
    "        \"problem\": CODEFORCES_TEMPLATE.format(\n",
    "            description=entry[\"description\"],\n",
    "            input_format=entry[\"input\"],\n",
    "            output_format=entry[\"output\"],\n",
    "            tags=entry[\"tags\"],\n",
    "            time_limit=entry[\"time_limit_ms\"],\n",
    "            memory_limit=entry[\"memory_limit_mb\"],\n",
    "        ),\n",
    "        \"tests\": tests,\n",
    "    }\n",
    "    dataset.append(new_entry)\n",
    "\n",
    "    contest_ids.append(entry[\"problem_id\"])\n",
    "\n",
    "print(f\"Test dataset size: {len(dataset)}\")\n",
    "\n",
    "output_dir = os.path.abspath(\"../../test/code\")\n",
    "output_file = os.path.join(output_dir, \"codeforces.json\")\n",
    "\n",
    "with open(output_file, \"w\") as f:\n",
    "    json.dump(dataset, f, indent=4)\n",
    "\n",
    "metadata_file = os.path.join(output_dir, \"metadata_cf.json\")\n",
    "\n",
    "with open(metadata_file, \"w\") as f:\n",
    "    json.dump(contest_ids, f, indent=4)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "ro1",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
